**06/23/2021
**file to bring in CPD FOIA files received in 2021 & assign unique IDs
 

**Housekeepig
local ver_suff _9_21
 
local ver_suff _9_21
clear all
if "`c(username)'"=="taehokim" |"`c(username)'"=="Taeho Kim"{ 
	cd "/Users/`c(username)'/Dropbox (Penn Law)/misconduct/do_files`ver_suff'"
} 
else{
	cd "/Users/`c(username)'/Dropbox/misconduct/do_files`ver_suff'"
}
set more off

//later pass through profiles dataset to fuzzy match 
local profiles_match `1'

*******************
*BRING IN AWARDS DATA
*******************
 
insheet using ../FOIA/FOIA_awards_05242021/17381-P662506-CPDAwards.csv

**Record name 
gen fname = trim(awardeefirstname)
gen lname = trim(AWARDEELASTNAME)
gen mi = trim(awardeemi)

**Record appointment date
gen appoint_dt = date(appointeddate, "DMY", 2025)
gen appoint_yr = year(appoint_dt)
gen appoint_m = month(appoint_dt)
gen appoint_d = day(appoint_dt)

**Record star number
destring star, replace force

//some obs have variations of last names: deflate last names
gen lname_mod = subinstr(lname, "-", "", .)
replace lname_mod = subinstr(lname_mod, " ", "", .)
replace lname_mod = subinstr(lname_mod, ".", "", .)


**Save intermediate dataset
save ../processed_data`ver_suff'/FOIA_processing/awards, replace

 **Deduplicate by matchign variables
duplicates drop fname lname_mod mi appoint_yr appoint_m appoint_d birthyear , force
keep fname lname lname_mod mi appoint_yr appoint_m appoint_d birthyear sex race resignationdate


sort fname lname_mod mi appoint_yr appoint_m appoint_d birthyear 

**Generate preliminary ID
gen pid = _n 

**Make soundex of names
gen fname_sdx = soundex(fname)
gen lname_sdx = soundex(lname)
 
**Save dataset with prelminary ID
save ../processed_data`ver_suff'/FOIA_processing/awards_pid, replace
clear


*******************
*SET UP LIST OF DATA TO STORE MATCHED IDS
*******************
clear 

set obs 1
gen u_pid = -1
gen pid = -1

save ../processed_data`ver_suff'/FOIA_processing/pid_appending, replace
clear


*******************
*FUZZY MATCHING
*******************

use ../processed_data`ver_suff'/FOIA_processing/awards_pid, clear // start with 29,354

**Perfect name, appoint date, birth year match
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 1 2 2 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 1 //19,978 so far

**Perfect name, appoint date, and birth year match, no middle initial
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 0 2 2 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 2  

**Perfect first name, soundex last name, appoint date, and birth year match, no middle initial
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 1 0 2 2 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 3  

**Soundex both names, middle initial, and appoint date, birth year match
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 1 1 1 2 2 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 4  

**Soundex both names, no middle initial, and appoint date, birth year match
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 1 1 0 2 2 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 5  

**first name, no middle initial, no last name and appoint date, birth year match
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 0 0 2 2 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 6  

**first name, no middle initial, no last name and appoint date, birth year match
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 0 0 2 2 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 7  


**first name, last name, off in appoint_yr 
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 0 1 2 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 9  


**first name, last name, off in appoint_mo 
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 0 2 1 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 10  


**first name, last name, off in appoint_day 
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 0 2 2 1 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 11  


**first name, last name, off in birth year 
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 0 2 2 2 1 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 12  

 
**first name, last name, no other match 
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 0  3 3 3 3 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 13  
 
**soundex first name, soundex last name, no other match 
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 1 1 0  3 3 3 3 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 14  
 
 
*******************
*KEEP BEST MATCHES
******************* 

use ../processed_data`ver_suff'/FOIA_processing/pid_appending.dta, clear

//drop the less good matches if individuals in awards data are matched with more than one in profiles

sort pid match_no u_pid
quietly by pid:  gen dup = cond(_N==1,0,_n)
sort dup
tab dup

drop if dup>1 
drop dup 
sort pid match_no

tempfile best_matches
save `best_matches'
 
*******************
*MERGE DE-DUPLICATED AWARDS DATA WITH MATCHINGS
******************* 

use ../processed_data`ver_suff'/FOIA_processing/awards_pid, clear

merge 1:1 pid using `best_matches', keep(1 3)  //24,487 of 29354 are matched in awards

 


